This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

library(lme4)
Loading required package: Matrix
library(ggplot2)
library(plotrix)
library(visreg)
library(stringr)
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
✓ tibble  3.1.0     ✓ purrr   0.3.4
✓ tidyr   1.1.2     ✓ dplyr   1.0.4
✓ readr   1.4.0     ✓ forcats 0.5.0
── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x tidyr::expand() masks Matrix::expand()
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
x tidyr::pack()   masks Matrix::pack()
x tidyr::unpack() masks Matrix::unpack()
library(ggpubr)
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
Registered S3 methods overwritten by 'car':
  method                          from
  influence.merMod                lme4
  cooks.distance.influence.merMod lme4
  dfbeta.influence.merMod         lme4
  dfbetas.influence.merMod        lme4
library(gridExtra)

Attaching package: ‘gridExtra’

The following object is masked from ‘package:dplyr’:

    combine
library(tidyverse)
library(tidymodels)
── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels 0.1.2 ──
✓ broom     0.7.3      ✓ recipes   0.1.15
✓ dials     0.0.9      ✓ rsample   0.0.9 
✓ infer     0.5.4      ✓ tune      0.1.3 
✓ modeldata 0.1.0      ✓ workflows 0.2.2 
✓ parsnip   0.1.5      ✓ yardstick 0.0.8 
── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ──
x gridExtra::combine() masks dplyr::combine()
x scales::discard()    masks purrr::discard()
x tidyr::expand()      masks Matrix::expand()
x dplyr::filter()      masks stats::filter()
x recipes::fixed()     masks stringr::fixed()
x dplyr::lag()         masks stats::lag()
x tidyr::pack()        masks Matrix::pack()
x yardstick::spec()    masks readr::spec()
x recipes::step()      masks stats::step()
x tidyr::unpack()      masks Matrix::unpack()
x recipes::update()    masks Matrix::update(), stats::update()
library(vip) 

Attaching package: ‘vip’

The following object is masked from ‘package:utils’:

    vi
library(plm)

Attaching package: ‘plm’

The following objects are masked from ‘package:dplyr’:

    between, lag, lead
lasso_output_data <- read_csv("/Users/noa/Workspace/lasso_positions_sampling_results/new_spr_results.csv")

── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
  .default = col_double(),
  Lasso_folder = col_character(),
  RAxML_baseline_run_prefix = col_character(),
  SPR_search_starting_tree_ll = col_logical(),
  SPR_search_starting_tree_newick = col_character(),
  alphas = col_character(),
  alternative_files_folder = col_character(),
  brlen_generator = col_character(),
  curr_job_folder = col_character(),
  curr_msa_version_folder = col_character(),
  dataset_id = col_character(),
  file_name = col_character(),
  file_type = col_character(),
  file_type_biopython = col_character(),
  jobs_prefix = col_character(),
  lasso_SPR_first_phase_tree_newick = col_character(),
  lasso_SPR_second_phase_tree_newick = col_character(),
  lasso_SPR_starting_tree_path = col_character(),
  lasso_baseline_run_prefix = col_character(),
  lasso_ll_per_iteration_first_phase = col_character(),
  lasso_ll_per_iteration_second_phase = col_character()
  # ... with 21 more columns
)
ℹ Use `spec()` for the full column specifications.
data_source <- read_csv("/Users/noa/Workspace/data/sampled_datasets.csv")
Missing column names filled in: 'X1' [1]
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
  X1 = col_double(),
  path = col_character(),
  db = col_character(),
  ntaxa = col_double(),
  nchars = col_double()
)
lasso_output_data<-lasso_output_data%>%mutate(relative_path = str_replace_all(dataset_id,'(/groups/pupko/noaeker/data/ABC_DR/)|(/ref_msa.aa.phy)',""))
spr_data= merge(lasso_output_data,data_source,by.x="relative_path",by.y="path", all.x = TRUE)

if (nrow(spr_data)<nrow(lasso_output_data))
{print("Problem in matching path names")
  cat("nrow data=",nrow(spr_data), "nrow lasso=",nrow(lasso_output_data))
}
spr_data <- spr_data %>% mutate(sample_pct_rd = floor(as.integer(sample_pct*100)/5)*(5/100), delta_ll_first_phase = ifelse(rf_dist_first_phase>0,naive_SPR_ll-lasso_SPR_first_phase_ll,0), delta_ll_second_phase = ifelse(rf_dist_second_phase>0,naive_SPR_ll-lasso_SPR_second_phase_ll,0)) %>% filter(n_seq>=5)
spr_data %>% distinct(db,dataset_id) %>% group_by(db)%>% summarize(n=n())

plot for the example MSA

example_msa_data<- spr_data %>% filter (relative_path=="Selectome/Euteleostomi/ENSGT00660000095541", actucal_training_size==800)

example_msa_ll<- read_csv("/Users/noa/Workspace/lasso_positions_sampling_results/test_sitelh_df_prediction_grant.csv")
Missing column names filled in: 'X1' [1]
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
  X1 = col_double(),
  predicted_test_ll = col_double(),
  true_test_ll = col_double()
)
g00<-ggplot(data=example_msa_data, aes(x=sample_pct, y=1-`lasso_test_R^2`)) +theme_classic()+
geom_line() +geom_point()+labs(title="")+xlab("Percentage of chosen positions")+ ylab("unexplained variance (%)") + theme(plot.title = element_text(hjust = 0.5))+scale_x_continuous(labels = scales::percent)+scale_y_continuous(labels = scales::percent)+ theme(plot.title = element_text(hjust = 0.5))+theme(axis.text=element_text(size=11),
        axis.title=element_text(size=11))

g01<-ggplot(example_msa_ll,aes(true_test_ll, predicted_test_ll)) +theme_classic()+
  geom_smooth(method='lm')+xlab("True LL")+ylab("Predicted LL ")+geom_point()+ labs(title="")+theme(plot.title = element_text(hjust = 0.5))+ theme(plot.title = element_text(hjust = 0.5))+theme(axis.text=element_text(size=11),
        axis.title=element_text(size=11))


# g02<-ggplot(data=example_MSA_data_lasso, aes(x=actucal_training_size,color=brlen_generator, y=sample_pct))+theme_classic()+ labs(color = "Branch length distribution")+
#      geom_line() +geom_point()+
#    xlab("Training size")+ylab("Positions (%)")+ #theme(axis.title=element_text(size=10),)
#   guides(fill=guide_legend(title="Branch length distribution"))+ggtitle("Median percentage of positions chosen by Lasso")+scale_y_continuous(labels = scales::percent)+ theme(legend.position = "none")+ theme(plot.title = element_text(hjust = 0.5))+labs(color = "Branch length distribution")
# 
ggarrange(g00, g01, hjust=-1,vjust=1, heights=c(2,2),common.legend=TRUE,
          labels = c("A", "B"),
          ncol = 1, nrow =2)
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'

General histograms

spr_data_per_dataset <- spr_data %>% group_by(relative_path, gap_pct, divergence, constant_sites_pct, avg_entropy, n_seq, n_loci) %>%
  summarise(best_delta_ll =  min(delta_ll_first_phase))
`summarise()` has grouped output by 'relative_path', 'gap_pct', 'divergence', 'constant_sites_pct', 'avg_entropy', 'n_seq'. You can override using the `.groups` argument.
ggplot(data = spr_data_per_dataset, aes(n_seq)) +geom_histogram()

ggplot(data = spr_data_per_dataset, aes(n_loci)) +geom_histogram()

#ggplot(data = spr_data_per_dataset, aes(x=gap_pct)) +geom_histogram()
#ggplot(data = spr_data_per_dataset, aes(x=divergence)) +geom_histogram()
#ggplot(data = spr_data_per_dataset, aes(x=constant_sites_pct)) +geom_histogram()
#ggplot(data = spr_data_per_dataset, aes(x=avg_entropy)) +geom_histogram()
ggplot(data = spr_data_per_dataset, aes(x=best_delta_ll)) +geom_histogram()

NA
NA

Find datasets in which a better tree was found

good_datasets<-spr_data %>% filter(delta_ll_first_phase<0) %>% distinct(relative_path) %>% pull(relative_path)
good_datasets
[1] "Selectome/Euteleostomi/ENSGT00610000085715"
spr_data %>% filter(relative_path %in% good_datasets ) %>% dplyr::select(relative_path,n_seq, gap_pct,constant_sites_pct,divergence,avg_entropy, actucal_training_size,sample_pct, `R^2_pearson_during_tree_search`,delta_ll_first_phase,lasso_SPR_first_phase_spr_moves,lasso_SPR_second_phase_spr_moves,lasso_SPR_second_phase_ll, naive_SPR_spr_moves)
good_datasets
[1] "Selectome/Euteleostomi/ENSGT00610000085715"

Find datasets in which a much worse tree was found

problematic_datasets<-spr_data %>% filter(delta_ll_first_phase>500) %>% distinct(relative_path) %>% pull(relative_path)
problematic_datasets
[1] "Selectome/Euteleostomi/ENSGT00530000063613"
spr_data  %>% filter(relative_path %in% problematic_datasets) %>% dplyr::select(relative_path,n_seq, actucal_training_size,sample_pct, `R^2_pearson_during_tree_search`,delta_ll_first_phase,lasso_SPR_first_phase_spr_moves,lasso_SPR_second_phase_spr_moves,lasso_SPR_second_phase_ll, naive_SPR_spr_moves)
NA
NA
NA

Find datasets in which delta ll on second phase is greater than zero

not_best_final_tree_datasets<-spr_data %>% filter(delta_ll_second_phase>0) %>% distinct(relative_path) %>% pull(relative_path)
problematic_datasets
[1] "Selectome/Euteleostomi/ENSGT00530000063613"
spr_data  %>% filter(relative_path %in% not_best_final_tree_datasets) %>% dplyr::select(relative_path,n_seq, actucal_training_size,sample_pct, `R^2_pearson_during_tree_search`,delta_ll_first_phase,delta_ll_second_phase,rf_dist_first_phase,lasso_SPR_first_phase_spr_moves,lasso_SPR_second_phase_spr_moves,lasso_SPR_second_phase_ll,rf_dist_second_phase, naive_SPR_spr_moves)
NA
NA

Accuracy metrics as a function of training size and sample percentage



#spr_data %>% filter(!relative_path %in% problematic_datasets) %>% ggplot(aes(x=as.factor(paste(actucal_training_size,"\n",sample_pct_rd)),y #=lasso_test_spearmanr )) + geom_boxplot() + labs(x="training_size\nsmple_pct")

spr_data %>% filter(!relative_path %in% problematic_datasets) %>% ggplot(aes(x=as.factor(sample_pct_rd),y =delta_ll_first_phase, fill = as.factor(sample_pct_rd) )) + geom_boxplot()+ labs(x="smple_pct") + facet_wrap(~actucal_training_size)

spr_data %>% filter(!relative_path %in% problematic_datasets) %>% ggplot(aes(x=as.factor(sample_pct_rd),y =lasso_test_spearmanr,fill = as.factor(sample_pct_rd) )) + geom_boxplot()+ labs(x="smple_pct") + facet_wrap(~actucal_training_size)

spr_data %>% filter(!relative_path %in% problematic_datasets) %>% ggplot(aes(x=as.factor(sample_pct_rd),y =`lasso_test_R^2`,fill = as.factor(sample_pct_rd) )) + geom_boxplot()+ labs(x="smple_pct") + facet_wrap(~actucal_training_size)

spr_data %>% filter(!relative_path %in% problematic_datasets) %>% ggplot(aes(x=as.factor(sample_pct_rd),y =`R^2_pearson_during_tree_search`,fill = as.factor(sample_pct_rd) )) + geom_boxplot()+ labs(x="smple_pct") + facet_wrap(~actucal_training_size)

NA
NA
NA
NA
NA

minimal training size required for best accuracy + minimum sample percentage required for best accuracy vs. sample percentage

spr_data_ok <- spr_data# %>%   filter(!(relative_path %in% problematic_datasets))
spr_data_ok_best_rows<- spr_data_ok %>% group_by(relative_path) %>%  summarise (delta_ll_first_phase =min(delta_ll_first_phase)) 
spr_full_data_best_rows<-spr_data_ok_best_data <- spr_data_ok %>% semi_join(spr_data_ok_best_rows) 
Joining, by = c("relative_path", "delta_ll_first_phase")
spr_data_ok_best_data <- spr_full_data_best_rows %>% group_by(relative_path,n_loci, delta_ll_first_phase, n_seq,avg_entropy, gap_pct, constant_sites_pct) 
min_sample_pct_data <- spr_data_ok_best_data %>% summarise(min_sample_pct = min(sample_pct_rd ))
`summarise()` has grouped output by 'relative_path', 'n_loci', 'delta_ll_first_phase', 'n_seq', 'avg_entropy', 'gap_pct'. You can override using the `.groups` argument.
min_sample_pct_data %>% ggplot(aes(x=as.factor(n_seq),y=min_sample_pct)) + scale_y_continuous(labels = scales::percent)+ geom_boxplot()

min_training_size_data <- spr_data_ok_best_data %>% summarise(min_training_size = min(actucal_training_size))
`summarise()` has grouped output by 'relative_path', 'n_loci', 'delta_ll_first_phase', 'n_seq', 'avg_entropy', 'gap_pct'. You can override using the `.groups` argument.
min_training_size_data %>% ggplot(aes(x=as.factor(n_seq),y=min_training_size )) + geom_boxplot()


#test.training.size<-spr_data_ok_best_data %>% summarise(min_training_size = min(actucal_training_size))  
#test.sample.pct<-spr_data_ok_best_data %>% summarise(min_sample_pct = min(sample_pct))
#lm.training.size<-lm(log(min_training_size)~log(n_loci)+n_seq+avg_entropy+ gap_pct+constant_sites_pct, data=test.training.size)
#lm.sample.pct<-lm(log(min_sample_pct)~log(n_loci)+n_seq+avg_entropy+ gap_pct+constant_sites_pct, data=test.sample.pct)

#summary(lm.training.size)
#summary(lm.sample.pct)
spr_data_ok_best_rows
spr_full_data_best_rows %>% dplyr::select(relative_path,n_loci, actucal_training_size, sample_pct_rd,delta_ll_first_phase)

Focusing on datasets with 15 sequences, what is the best cuttoff?


spr_data %>%filter(n_seq==15)%>% group_by(actucal_training_size, sample_pct_rd) %>% summarize(pct_identical = sum(rf_dist_first_phase==0)/sum(rf_dist_first_phase==0 | rf_dist_first_phase>0), identical_topo= sum(rf_dist_first_phase==0), diff_topo = sum(rf_dist_first_phase>0), max_rf = max(rf_dist_first_phase), max_delta_ll= max(delta_ll_first_phase),median_delta_ll= median(delta_ll_first_phase[rf_dist_first_phase>0]),mean_delta_ll = mean(delta_ll_first_phase[rf_dist_first_phase>0]), max_spr_dist = max(lasso_SPR_second_phase_spr_moves), )
`summarise()` has grouped output by 'actucal_training_size'. You can override using the `.groups` argument.
spr_data %>%filter(n_seq==15) %>% filter(!(relative_path %in% problematic_datasets)) %>% filter(rf_dist_first_phase>0) %>% ggplot(aes(x=as.factor(sample_pct_rd),y= delta_ll_first_phase)) + geom_boxplot() + facet_wrap(~as.factor(actucal_training_size))

NA
NA
NA
NA

Focusing on datasets with less than 15 sequences, what is the best cuttoff?


spr_data %>%filter(n_seq<15)%>% group_by(actucal_training_size, sample_pct_rd) %>% summarize(pct_identical = sum(rf_dist_first_phase==0)/sum(rf_dist_first_phase==0 | rf_dist_first_phase>0), identical_topo= sum(rf_dist_first_phase==0), diff_topo = sum(rf_dist_first_phase>0), max_rf = max(rf_dist_first_phase), max_delta_ll= max(delta_ll_first_phase),median_delta_ll= median(delta_ll_first_phase[rf_dist_first_phase>0]),mean_delta_ll = mean(delta_ll_first_phase[rf_dist_first_phase>0]), max_spr_dist = max(lasso_SPR_second_phase_spr_moves), )
`summarise()` has grouped output by 'actucal_training_size'. You can override using the `.groups` argument.
spr_data %>%filter(n_seq<15) %>% filter(rf_dist_first_phase>0) %>% ggplot(aes(x=as.factor(sample_pct_rd),y= delta_ll_first_phase)) + geom_boxplot() +facet_wrap(~as.factor(actucal_training_size))

NA
NA
NA
NA

spr_data %>% filter(!relative_path %in% problematic_datasets) %>% ggplot(aes(x=`lasso_test_R^2`,y =`R^2_pearson_during_tree_search` )) +geom_point()


print(cor(spr_data$`lasso_test_R^2`, spr_data$`R^2_pearson_during_tree_search`))
[1] 0.2751959
spr_data %>% filter(!relative_path %in% problematic_datasets) %>% ggplot(aes(x=`lasso_test_R^2`,y = delta_ll_first_phase )) +geom_point()


print(cor(spr_data$`lasso_test_R^2`, spr_data$delta_ll_first_phase))
[1] -0.2942886
spr_data %>% filter(!relative_path %in% problematic_datasets) %>% ggplot(aes(x=`R^2_pearson_during_tree_search`,y = delta_ll_first_phase )) +geom_point()


print(cor(spr_data$`R^2_pearson_during_tree_search`, spr_data$delta_ll_first_phase ))
[1] -0.6809226
spr_data %>% filter(!relative_path %in% problematic_datasets) %>% ggplot(aes(x= mistake_cnt,y = delta_ll_first_phase )) +geom_point()


print(cor(spr_data$mistake_cnt, spr_data$delta_ll_first_phase ))
[1] 0.2461368

Looking at each dataset separately. How delta ll changes with training size and sample pct


for (dataset in unique(spr_data$relative_path)) {
   
print(spr_data %>%filter(relative_path==dataset ) %>% ggplot(aes(x=sample_pct,y=delta_ll_first_phase, group = as.factor(actucal_training_size), colour = as.factor(actucal_training_size))) + geom_line() + geom_point()+ scale_x_continuous(labels = scales::percent)) + labs(title=dataset)
     
}

Looking at each dataset separately. How R2 changes with training size and sample pct

for (dataset in unique(spr_data$relative_path)) {
   
print(spr_data %>%filter(relative_path==dataset ) %>% ggplot(aes(x=sample_pct,y=`lasso_test_R^2`, group = as.factor(actucal_training_size), colour = as.factor(actucal_training_size))) + geom_line() + geom_point()+ scale_x_continuous(labels = scales::percent)) + labs(title=dataset)
     
}

All datasets with 15 sequences



spr_data %>%filter(!(relative_path %in% problematic_datasets) & (actucal_training_size==800) & (n_seq==15)) %>% ggplot(aes(x=sample_pct_rd,y=delta_ll_first_phase, group= relative_path, colour = relative_path)) + geom_line() + theme(legend.position = "none") + scale_x_continuous(labels = scales::percent)+labs(title="training size 800")


spr_data %>%filter(!(relative_path %in% problematic_datasets) & (actucal_training_size==400) & (n_seq==15)) %>% ggplot(aes(x=sample_pct_rd,y=delta_ll_first_phase, group= relative_path, colour = relative_path)) + geom_line() + theme(legend.position = "none") + scale_x_continuous(labels = scales::percent)+labs(title="training size 400")


spr_data %>%filter(!(relative_path %in% problematic_datasets) & (actucal_training_size==200) & n_seq==15) %>% ggplot(aes(x=sample_pct_rd,y=delta_ll_first_phase, group= relative_path, colour = relative_path)) + geom_line() + theme(legend.position = "none") + scale_x_continuous(labels = scales::percent)+labs(title="training size 200")


spr_data %>%filter(!(relative_path %in% problematic_datasets) & (actucal_training_size==100) & n_seq==15) %>% ggplot(aes(x=sample_pct_rd,y=delta_ll_first_phase, group= relative_path, colour = relative_path)) + geom_line() + theme(legend.position = "none") + scale_x_continuous(labels = scales::percent)+labs(title="training size 100")

NA
NA
NA

Less than 15 sequences



spr_data %>%filter(!(relative_path %in% problematic_datasets) & (actucal_training_size==800) & (n_seq<15)) %>% ggplot(aes(x=sample_pct,y=delta_ll_first_phase, group= relative_path, colour = relative_path)) + geom_point() + geom_line()+ theme(legend.position = "none") + scale_x_continuous(labels = scales::percent)+labs(title="training size 800")


spr_data %>%filter(!(relative_path %in% problematic_datasets) & (actucal_training_size==400) & (n_seq<15)) %>% ggplot(aes(x=sample_pct,y=delta_ll_first_phase, group= relative_path, colour = relative_path)) + geom_line()+ geom_point() + theme(legend.position = "none") + scale_x_continuous(labels = scales::percent)+labs(title="training size 400")


spr_data %>%filter(!(relative_path %in% problematic_datasets) & (actucal_training_size==200) & n_seq<15) %>% ggplot(aes(x=sample_pct,y=delta_ll_first_phase, group= relative_path, colour = relative_path)) + geom_line()+ geom_point() + theme(legend.position = "none") + scale_x_continuous(labels = scales::percent)+labs(title="training size 200")


spr_data %>%filter(!(relative_path %in% problematic_datasets) & (actucal_training_size==100) & n_seq<15) %>% ggplot(aes(x=sample_pct,y=delta_ll_first_phase, group= relative_path, colour = relative_path)) + geom_line()+ geom_point() + theme(legend.position = "none") + scale_x_continuous(labels = scales::percent)+labs(title="training size 100")

NA
NA
NA

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.

---
title: "R Notebook"
output: html_notebook
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. 

```{r}
library(lme4)
library(ggplot2)
library(plotrix)
library(visreg)
library(stringr)
library(tidyverse)
library(ggpubr)
library(gridExtra)
library(tidyverse)
library(tidymodels)
library(vip) 
library(plm)

```


```{r}
lasso_output_data <- read_csv("/Users/noa/Workspace/lasso_positions_sampling_results/new_spr_results.csv")
data_source <- read_csv("/Users/noa/Workspace/data/sampled_datasets.csv")
lasso_output_data<-lasso_output_data%>%mutate(relative_path = str_replace_all(dataset_id,'(/groups/pupko/noaeker/data/ABC_DR/)|(/ref_msa.aa.phy)',""))
spr_data= merge(lasso_output_data,data_source,by.x="relative_path",by.y="path", all.x = TRUE)

if (nrow(spr_data)<nrow(lasso_output_data))
{print("Problem in matching path names")
  cat("nrow data=",nrow(spr_data), "nrow lasso=",nrow(lasso_output_data))
}
```
```{r}
spr_data <- spr_data %>% mutate(sample_pct_rd = floor(as.integer(sample_pct*100)/5)*(5/100), delta_ll_first_phase = ifelse(rf_dist_first_phase>0,naive_SPR_ll-lasso_SPR_first_phase_ll,0), delta_ll_second_phase = ifelse(rf_dist_second_phase>0,naive_SPR_ll-lasso_SPR_second_phase_ll,0)) %>% filter(n_seq>=5)

```

```{r}
spr_data %>% distinct(db,dataset_id) %>% group_by(db)%>% summarize(n=n())
```

plot for the example MSA 

```{r}
example_msa_data<- spr_data %>% filter (relative_path=="Selectome/Euteleostomi/ENSGT00660000095541", actucal_training_size==800)

example_msa_ll<- read_csv("/Users/noa/Workspace/lasso_positions_sampling_results/test_sitelh_df_prediction_grant.csv")

g00<-ggplot(data=example_msa_data, aes(x=sample_pct, y=1-`lasso_test_R^2`)) +theme_classic()+
geom_line() +geom_point()+labs(title="")+xlab("Percentage of chosen positions")+ ylab("unexplained variance (%)") + theme(plot.title = element_text(hjust = 0.5))+scale_x_continuous(labels = scales::percent)+scale_y_continuous(labels = scales::percent)+ theme(plot.title = element_text(hjust = 0.5))+theme(axis.text=element_text(size=11),
        axis.title=element_text(size=11))

g01<-ggplot(example_msa_ll,aes(true_test_ll, predicted_test_ll)) +theme_classic()+
  geom_smooth(method='lm')+xlab("True LL")+ylab("Predicted LL ")+geom_point()+ labs(title="")+theme(plot.title = element_text(hjust = 0.5))+ theme(plot.title = element_text(hjust = 0.5))+theme(axis.text=element_text(size=11),
        axis.title=element_text(size=11))


# g02<-ggplot(data=example_MSA_data_lasso, aes(x=actucal_training_size,color=brlen_generator, y=sample_pct))+theme_classic()+ labs(color = "Branch length distribution")+
#      geom_line() +geom_point()+
#    xlab("Training size")+ylab("Positions (%)")+ #theme(axis.title=element_text(size=10),)
#   guides(fill=guide_legend(title="Branch length distribution"))+ggtitle("Median percentage of positions chosen by Lasso")+scale_y_continuous(labels = scales::percent)+ theme(legend.position = "none")+ theme(plot.title = element_text(hjust = 0.5))+labs(color = "Branch length distribution")
# 
ggarrange(g00, g01, hjust=-1,vjust=1, heights=c(2,2),common.legend=TRUE,
          labels = c("A", "B"),
          ncol = 1, nrow =2)




```

General histograms

```{r}
spr_data_per_dataset <- spr_data %>% group_by(relative_path, gap_pct, divergence, constant_sites_pct, avg_entropy, n_seq, n_loci) %>%
  summarise(best_delta_ll =  min(delta_ll_first_phase))

ggplot(data = spr_data_per_dataset, aes(n_seq)) +geom_histogram()
ggplot(data = spr_data_per_dataset, aes(n_loci)) +geom_histogram()
#ggplot(data = spr_data_per_dataset, aes(x=gap_pct)) +geom_histogram()
#ggplot(data = spr_data_per_dataset, aes(x=divergence)) +geom_histogram()
#ggplot(data = spr_data_per_dataset, aes(x=constant_sites_pct)) +geom_histogram()
#ggplot(data = spr_data_per_dataset, aes(x=avg_entropy)) +geom_histogram()
ggplot(data = spr_data_per_dataset, aes(x=best_delta_ll)) +geom_histogram()





```
Find datasets in which a better tree was found

```{r}
good_datasets<-spr_data %>% filter(delta_ll_first_phase<0) %>% distinct(relative_path) %>% pull(relative_path)
good_datasets
spr_data %>% filter(relative_path %in% good_datasets ) %>% dplyr::select(relative_path,n_seq, gap_pct,constant_sites_pct,divergence,avg_entropy, actucal_training_size,sample_pct, `R^2_pearson_during_tree_search`,delta_ll_first_phase,lasso_SPR_first_phase_spr_moves,lasso_SPR_second_phase_spr_moves,lasso_SPR_second_phase_ll, naive_SPR_spr_moves)
good_datasets

```

Find datasets in which a much worse tree was found
```{r}
problematic_datasets<-spr_data %>% filter(delta_ll_first_phase>500) %>% distinct(relative_path) %>% pull(relative_path)
problematic_datasets
spr_data  %>% filter(relative_path %in% problematic_datasets) %>% dplyr::select(relative_path,n_seq, actucal_training_size,sample_pct, `R^2_pearson_during_tree_search`,delta_ll_first_phase,lasso_SPR_first_phase_spr_moves,lasso_SPR_second_phase_spr_moves,lasso_SPR_second_phase_ll, naive_SPR_spr_moves)



```

Find datasets in which delta ll on second phase is greater than zero

```{r}
not_best_final_tree_datasets<-spr_data %>% filter(delta_ll_second_phase>0) %>% distinct(relative_path) %>% pull(relative_path)
problematic_datasets
spr_data  %>% filter(relative_path %in% not_best_final_tree_datasets) %>% dplyr::select(relative_path,n_seq, actucal_training_size,sample_pct, `R^2_pearson_during_tree_search`,delta_ll_first_phase,delta_ll_second_phase,rf_dist_first_phase,lasso_SPR_first_phase_spr_moves,lasso_SPR_second_phase_spr_moves,lasso_SPR_second_phase_ll,rf_dist_second_phase, naive_SPR_spr_moves)


```

Accuracy metrics as a function of training size and sample percentage

```{r}


#spr_data %>% filter(!relative_path %in% problematic_datasets) %>% ggplot(aes(x=as.factor(paste(actucal_training_size,"\n",sample_pct_rd)),y #=lasso_test_spearmanr )) + geom_boxplot() + labs(x="training_size\nsmple_pct")

spr_data %>% filter(!relative_path %in% problematic_datasets) %>% ggplot(aes(x=as.factor(sample_pct_rd),y =delta_ll_first_phase, fill = as.factor(sample_pct_rd) )) + geom_boxplot()+ labs(x="smple_pct") + facet_wrap(~actucal_training_size)
spr_data %>% filter(!relative_path %in% problematic_datasets) %>% ggplot(aes(x=as.factor(sample_pct_rd),y =lasso_test_spearmanr,fill = as.factor(sample_pct_rd) )) + geom_boxplot()+ labs(x="smple_pct") + facet_wrap(~actucal_training_size)
spr_data %>% filter(!relative_path %in% problematic_datasets) %>% ggplot(aes(x=as.factor(sample_pct_rd),y =`lasso_test_R^2`,fill = as.factor(sample_pct_rd) )) + geom_boxplot()+ labs(x="smple_pct") + facet_wrap(~actucal_training_size)
spr_data %>% filter(!relative_path %in% problematic_datasets) %>% ggplot(aes(x=as.factor(sample_pct_rd),y =`R^2_pearson_during_tree_search`,fill = as.factor(sample_pct_rd) )) + geom_boxplot()+ labs(x="smple_pct") + facet_wrap(~actucal_training_size)





```


minimal training size required for best accuracy + minimum sample percentage required for best accuracy vs. sample percentage

```{r}
spr_data_ok <- spr_data# %>%   filter(!(relative_path %in% problematic_datasets))
spr_data_ok_best_rows<- spr_data_ok %>% group_by(relative_path) %>%  summarise (delta_ll_first_phase =min(delta_ll_first_phase)) 
spr_full_data_best_rows<-spr_data_ok_best_data <- spr_data_ok %>% semi_join(spr_data_ok_best_rows) 
spr_data_ok_best_data <- spr_full_data_best_rows %>% group_by(relative_path,n_loci, delta_ll_first_phase, n_seq,avg_entropy, gap_pct, constant_sites_pct) 
min_sample_pct_data <- spr_data_ok_best_data %>% summarise(min_sample_pct = min(sample_pct_rd ))
min_sample_pct_data %>% ggplot(aes(x=as.factor(n_seq),y=min_sample_pct)) + scale_y_continuous(labels = scales::percent)+ geom_boxplot()
min_training_size_data <- spr_data_ok_best_data %>% summarise(min_training_size = min(actucal_training_size))
min_training_size_data %>% ggplot(aes(x=as.factor(n_seq),y=min_training_size )) + geom_boxplot()

#test.training.size<-spr_data_ok_best_data %>% summarise(min_training_size = min(actucal_training_size))  
#test.sample.pct<-spr_data_ok_best_data %>% summarise(min_sample_pct = min(sample_pct))
#lm.training.size<-lm(log(min_training_size)~log(n_loci)+n_seq+avg_entropy+ gap_pct+constant_sites_pct, data=test.training.size)
#lm.sample.pct<-lm(log(min_sample_pct)~log(n_loci)+n_seq+avg_entropy+ gap_pct+constant_sites_pct, data=test.sample.pct)

#summary(lm.training.size)
#summary(lm.sample.pct)

```

```{r}
spr_data_ok_best_rows
spr_full_data_best_rows %>% dplyr::select(relative_path,n_loci, actucal_training_size, sample_pct_rd,delta_ll_first_phase)
```



Focusing on datasets with 15 sequences, what is the best cuttoff?

```{r}

spr_data %>%filter(n_seq==15)%>% group_by(actucal_training_size, sample_pct_rd) %>% summarize(pct_identical = sum(rf_dist_first_phase==0)/sum(rf_dist_first_phase==0 | rf_dist_first_phase>0), identical_topo= sum(rf_dist_first_phase==0), diff_topo = sum(rf_dist_first_phase>0), max_rf = max(rf_dist_first_phase), max_delta_ll= max(delta_ll_first_phase),median_delta_ll= median(delta_ll_first_phase[rf_dist_first_phase>0]),mean_delta_ll = mean(delta_ll_first_phase[rf_dist_first_phase>0]), max_spr_dist = max(lasso_SPR_second_phase_spr_moves), )



spr_data %>%filter(n_seq==15) %>% filter(!(relative_path %in% problematic_datasets)) %>% filter(rf_dist_first_phase>0) %>% ggplot(aes(x=as.factor(sample_pct_rd),y= delta_ll_first_phase)) + geom_boxplot() + facet_wrap(~as.factor(actucal_training_size))

spr_data %>%filter(n_seq==15) %>% filter(!(relative_path %in% problematic_datasets)) %>% filter(rf_dist_first_phase>0) %>% ggplot(aes(x=as.factor(sample_pct_rd),y= delta_ll_first_phase)) + geom_boxplot() + facet_wrap(~as.factor(actucal_training_size))




```


Focusing on datasets with less than 15 sequences, what is the best cuttoff?

```{r}

spr_data %>%filter(n_seq<15)%>% group_by(actucal_training_size, sample_pct_rd) %>% summarize(pct_identical = sum(rf_dist_first_phase==0)/sum(rf_dist_first_phase==0 | rf_dist_first_phase>0), identical_topo= sum(rf_dist_first_phase==0), diff_topo = sum(rf_dist_first_phase>0), max_rf = max(rf_dist_first_phase), max_delta_ll= max(delta_ll_first_phase),median_delta_ll= median(delta_ll_first_phase[rf_dist_first_phase>0]),mean_delta_ll = mean(delta_ll_first_phase[rf_dist_first_phase>0]), max_spr_dist = max(lasso_SPR_second_phase_spr_moves), )



spr_data %>%filter(n_seq<15) %>% filter(rf_dist_first_phase>0) %>% ggplot(aes(x=as.factor(sample_pct_rd),y= delta_ll_first_phase)) + geom_boxplot() +facet_wrap(~as.factor(actucal_training_size))




```


```{r}

spr_data %>% filter(!relative_path %in% problematic_datasets) %>% ggplot(aes(x=`lasso_test_R^2`,y =`R^2_pearson_during_tree_search` )) +geom_point()

print(cor(spr_data$`lasso_test_R^2`, spr_data$`R^2_pearson_during_tree_search`))

spr_data %>% filter(!relative_path %in% problematic_datasets) %>% ggplot(aes(x=`lasso_test_R^2`,y = delta_ll_first_phase )) +geom_point()

print(cor(spr_data$`lasso_test_R^2`, spr_data$delta_ll_first_phase))

spr_data %>% filter(!relative_path %in% problematic_datasets) %>% ggplot(aes(x=`R^2_pearson_during_tree_search`,y = delta_ll_first_phase )) +geom_point()

print(cor(spr_data$`R^2_pearson_during_tree_search`, spr_data$delta_ll_first_phase ))


spr_data %>% filter(!relative_path %in% problematic_datasets) %>% ggplot(aes(x= mistake_cnt,y = delta_ll_first_phase )) +geom_point()

print(cor(spr_data$mistake_cnt, spr_data$delta_ll_first_phase ))
```


Looking at each dataset separately. How delta ll changes with training size and sample pct

```{r}

for (dataset in unique(spr_data$relative_path)) {
   
print(spr_data %>%filter(relative_path==dataset ) %>% ggplot(aes(x=sample_pct,y=delta_ll_first_phase, group = as.factor(actucal_training_size), colour = as.factor(actucal_training_size))) + geom_line() + geom_point()+ scale_x_continuous(labels = scales::percent)) + labs(title=dataset)
     
}

```
Looking at each dataset separately. How R2 changes with training size and sample pct

```{r}
for (dataset in unique(spr_data$relative_path)) {
   
print(spr_data %>%filter(relative_path==dataset ) %>% ggplot(aes(x=sample_pct,y=`lasso_test_R^2`, group = as.factor(actucal_training_size), colour = as.factor(actucal_training_size))) + geom_line() + geom_point()+ scale_x_continuous(labels = scales::percent)) + labs(title=dataset)
     
}
```


All datasets with 15 sequences

```{r}


spr_data %>%filter(!(relative_path %in% problematic_datasets) & (actucal_training_size==800) & (n_seq==15)) %>% ggplot(aes(x=sample_pct_rd,y=delta_ll_first_phase, group= relative_path, colour = relative_path)) + geom_line() + theme(legend.position = "none") + scale_x_continuous(labels = scales::percent)+labs(title="training size 800")

spr_data %>%filter(!(relative_path %in% problematic_datasets) & (actucal_training_size==400) & (n_seq==15)) %>% ggplot(aes(x=sample_pct_rd,y=delta_ll_first_phase, group= relative_path, colour = relative_path)) + geom_line() + theme(legend.position = "none") + scale_x_continuous(labels = scales::percent)+labs(title="training size 400")

spr_data %>%filter(!(relative_path %in% problematic_datasets) & (actucal_training_size==200) & n_seq==15) %>% ggplot(aes(x=sample_pct_rd,y=delta_ll_first_phase, group= relative_path, colour = relative_path)) + geom_line() + theme(legend.position = "none") + scale_x_continuous(labels = scales::percent)+labs(title="training size 200")

spr_data %>%filter(!(relative_path %in% problematic_datasets) & (actucal_training_size==100) & n_seq==15) %>% ggplot(aes(x=sample_pct_rd,y=delta_ll_first_phase, group= relative_path, colour = relative_path)) + geom_line() + theme(legend.position = "none") + scale_x_continuous(labels = scales::percent)+labs(title="training size 100")



```

Less than 15 sequences

```{r}


spr_data %>%filter(!(relative_path %in% problematic_datasets) & (actucal_training_size==800) & (n_seq<15)) %>% ggplot(aes(x=sample_pct,y=delta_ll_first_phase, group= relative_path, colour = relative_path)) + geom_point() + geom_line()+ theme(legend.position = "none") + scale_x_continuous(labels = scales::percent)+labs(title="training size 800")

spr_data %>%filter(!(relative_path %in% problematic_datasets) & (actucal_training_size==400) & (n_seq<15)) %>% ggplot(aes(x=sample_pct,y=delta_ll_first_phase, group= relative_path, colour = relative_path)) + geom_line()+ geom_point() + theme(legend.position = "none") + scale_x_continuous(labels = scales::percent)+labs(title="training size 400")

spr_data %>%filter(!(relative_path %in% problematic_datasets) & (actucal_training_size==200) & n_seq<15) %>% ggplot(aes(x=sample_pct,y=delta_ll_first_phase, group= relative_path, colour = relative_path)) + geom_line()+ geom_point() + theme(legend.position = "none") + scale_x_continuous(labels = scales::percent)+labs(title="training size 200")

spr_data %>%filter(!(relative_path %in% problematic_datasets) & (actucal_training_size==100) & n_seq<15) %>% ggplot(aes(x=sample_pct,y=delta_ll_first_phase, group= relative_path, colour = relative_path)) + geom_line()+ geom_point() + theme(legend.position = "none") + scale_x_continuous(labels = scales::percent)+labs(title="training size 100")



```



```{r}
spr_data_datasets_test<- spr_data %>% filter(delta_ll_first_phase>0 & actucal_training_size==800 & sample_pct_rd==0.2)  %>% pull(relative_path)



for (dataset in spr_data_datasets_test) {
   
print(spr_data %>%filter(relative_path==dataset ) %>% ggplot(aes(x=sample_pct,y=delta_ll_first_phase, group = as.factor(actucal_training_size), colour = as.factor(actucal_training_size))) + geom_line() + geom_point()+ scale_x_continuous(labels = scales::percent)) + labs(title=dataset)
#print(spr_data %>%filter(relative_path==dataset ) %>% ggplot(aes(x=sample_pct,y=`lasso_test_R^2`, group = as.factor(actucal_training_size), colour = #as.factor(actucal_training_size))) + geom_line() + geom_point()+ scale_x_continuous(labels = scales::percent)) + labs(title=dataset)
#print(spr_data %>%filter(relative_path==dataset ) %>% ggplot(aes(x=sample_pct,y=`R^2_pearson_during_tree_search`, group = as.factor(actucal_training_size), colour = as.factor(actucal_training_size))) + geom_line() + geom_point()+ scale_x_continuous(labels = scales::percent)) + labs(title=dataset)
     
}



spr_data %>% filter (relative_path %in% spr_data_datasets_test) %>% distinct (relative_path,n_loci, n_seq)
                                                                                                                                                     
```




```{r}
library(nlme)
lasso_split <- initial_split(spr_data_ok, prop = 0.75)
lasso_training <- lasso_split  %>% 
                        training()
lasso_test<- lasso_split  %>% 
                        testing()


model.exponential<-lme((delta_ll_first_phase)~actucal_training_size+ sample_pct+divergence+mad+gap_pct+number_loci_chosen+number_loci_chosen+n_loci,random=~1|relative_path, data=spr_data_ok)
plot(model.exponential)
print(summary(model.exponential))
print(intervals(model.exponential,which = "fixed"))
qqnorm(model.exponential, ~ranef(., level=1))


```


Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Cmd+Option+I*.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Cmd+Shift+K* to preview the HTML file). 

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike *Knit*, *Preview* does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.

